*===============================================================================
*=== FILE 2 --- IMPORT AND CLEAN FOR-RENT ZILLOW INFORMATION
*===============================================================================
* Session setup: turn off output paging and start from an empty dataset.
set more off
clear

* The working-directory path is left empty in this file. capture swallows
* any error from cd, so the script continues from the current directory.
capture cd ""



*===============================================================================
*=== FOR-RENT LISTING WORD PAIR COUNTS
*===============================================================================
* Load the flagged word / word-pair counts. The columns arrive with the
* generic names v1-v5: v1 is a "||"-joined record and v2-v5 are counts.
import delimited "replication output\FourState_FR_listings_word_pairs_count_flags.csv", clear

* Discard the first imported row (presumably a header/index line that was
* read in as data -- confirm against the CSV).
drop if _n==1

rename v2 word_count
rename v3 pair_count
rename v4 all_words
rename v5 all_pairs

* Break the "||"-joined record into pieces and keep the four fields needed
* to identify a listing; every other piece is discarded.
split v1, p(||) gen(original)

rename original2 propertyid
rename original3 postingid
rename original5 rentalprice
rename original6 listingcreationdate
drop original*
drop v1

* propertyid is forced to numeric (unparseable values become missing and are
* dropped below). postingid and rentalprice are converted only if every
* value is numeric.
destring propertyid, force replace
destring postingid, replace
destring rentalprice, replace

* A listing needs all three key fields to be usable.
drop if missing(propertyid)
drop if missing(postingid)
drop if missing(listingcreationdate)

* Remove rows that are identical on every variable.
duplicates drop

compress

order propertyid postingid listingcreationdate word_count

* Within each listing key, put the row with the highest word_count (then
* pair_count) first so that it is the one kept by -duplicates drop, force-.
* gsort offers no stable option, so rows tied on both counts would otherwise
* be kept in arbitrary order. The extra keys all_words, all_pairs and
* rentalprice break those ties and make the retained row reproducible from
* run to run (exact duplicates were already removed above).
gsort propertyid postingid listingcreationdate -word_count -pair_count all_words all_pairs rentalprice

duplicates drop propertyid postingid listingcreationdate, force

* Fail loudly if the key does not uniquely identify observations.
isid propertyid postingid listingcreationdate


* Attach the property-level facts; only listings whose propertyid is found
* in the facts file are retained.
merge m:1 propertyid using "data\preprocessed\pseudo\FourState_facts.dta"
keep if _merge==3
drop _merge

save "replication output\FourState_FR_merged", replace

* Keep listings with at least one flagged word or word pair.
* Stata orders missing values above every number, so a bare "x>0" is true
* when x is missing. The !missing() guards stop listings with a missing
* count from being treated as matches.
keep if (word_count>0 & !missing(word_count)) | (pair_count>0 & !missing(pair_count))

* Intermediate file: merged back onto the listing text further down.
save "replication output\FourState_FR_merged_matches", replace


* Re-import the original "||"-delimited listing file so that the description
* text can be attached to the matched word/pair records.
import delimited "data\pseudo\FR_listings.txt", delimiter("||", asstring) bindquote(loose) clear

* Shift every imported name one column to the right in a single group
* rename: the first column becomes string_fragments and the unnamed seventh
* column (v7) becomes listingdescription.
* NOTE(review): presumably the header row has one field fewer than the data
* rows -- confirm against the raw file.
rename (propertyid postingid postingtypeid rentalprice listingcreationdate listingdescription v7) (string_fragments propertyid postingid postingtypeid rentalprice listingcreationdate listingdescription)

* Remove rows that carry a posting id but have an empty description.
drop if listingdescription == "" & postingid != .

*FIX THE STRINGS FIRST
* string_fragments is the first imported column. Any value that parses as a
* number is blanked, so only non-numeric text fragments remain.
destring string_fragments, gen(fragment_num) force
replace string_fragments="" if fragment_num!=.
drop fragment_num

* Blank fragments that consist solely of spaces, whatever their length
* (this used to be five separate comparisons covering 1 to 5 spaces only).
replace string_fragments="" if trim(string_fragments)==""

* Append the fragments held in the rows that follow a listing row to that
* listing's description. A listing row has a non-missing postingid; a
* fragment row has a missing one. On pass i the fragment i rows below is
* appended only if rows 1..i below all have a missing postingid, i.e. the
* run of fragment rows is unbroken. At most 50 following rows are examined.
local unbroken_run postingid!=.

forvalues i=1/50 {
    * Extend the condition by one more row of missing postingid.
    local unbroken_run `unbroken_run' & postingid[_n+`i']==.

    replace listingdescription = listingdescription + " " + string_fragments[_n+`i'] if string_fragments[_n+`i']!="" & `unbroken_run'
}
drop string_fragments

* Description length is used only to rank duplicate listings.
generate stringlength = length(listingdescription)

* Rows missing either identifier cannot be keyed; missing() with several
* arguments is true when any one of them is missing.
drop if missing(propertyid, postingid)

* Put the longest description first within each listing key; that is the
* row -duplicates drop, force- keeps. Rows tied on length are kept in
* whatever order gsort leaves them.
gsort propertyid postingid listingcreationdate -stringlength
duplicates drop propertyid postingid listingcreationdate, force

* Stop with an error unless the key is now unique.
isid propertyid postingid listingcreationdate

drop stringlength

* Join the listing text to the matched word/pair records on the full
* listing key and discard anything not present on both sides.
merge 1:1 propertyid postingid listingcreationdate using "replication output\FourState_FR_merged_matches"
drop if _merge != 3
drop _merge

* Final output: matched listings together with their description text.
save "replication output\FourState_FR_merged_matches_text", replace

* The text-free intermediate file is no longer needed.
erase "replication output\FourState_FR_merged_matches.dta"

